setwd("/Users/vh3/Documents/MCA/ANALYSIS_3")

#load required packages
require("Matrix")
library(scater, quietly = TRUE)
require("SingleCellExperiment")
options(stringsAsFactors = FALSE)
library(plotly)
molecules <- read.table("allcounts4.csv", header = TRUE, sep = ",", row.names=1, stringsAsFactors = TRUE)
anno <- read.delim("allpheno8.2.csv", header = TRUE, sep = ",")

anno$is_control <- rep("FALSE", length(anno$Name_of_Sample))
anno[which(anno$Name_of_Sample != "SC"), ]$is_control <- "TRUE"
anno[which(anno$Plate=="core"), ]$Sortdate <- "Fall2017"


anno$col <- rep("violet", length(anno$sample_id))
anno[which(anno$ShortenedLifeStage=="bbSpz"), ]$col <- "navy"
anno[which(anno$ShortenedLifeStage=="ooSpz"), ]$col <- "lightskyblue"
anno[which(anno$ShortenedLifeStage2=="Female"), ]$col <- "purple4"
anno[which(anno$ShortenedLifeStage2=="Male"), ]$col <- "purple"
anno[which(anno$ShortenedLifeStage=="sgSpz"), ]$col <- "royalblue"
anno[which(anno$ShortenedLifeStage=="EEF"), ]$col <- "darkorange"
anno[which(anno$ShortenedLifeStage=="ook"), ]$col <- "turquoise4"
anno[which(anno$ShortenedLifeStage=="oocyst"), ]$col <- "steelblue"
anno[which(anno$ShortenedLifeStage=="Ring"), ]$col <- "hotpink"
anno[which(anno$ShortenedLifeStage=="Merozoite"), ]$col <- "lightpink"
anno[which(anno$ShortenedLifeStage2=="Trophozoite"), ]$col <- "violet"
anno[which(anno$ShortenedLifeStage=="ookoo"), ]$col <- "mediumturquoise"

cellnames <- colnames(molecules)
anno <- anno[match(cellnames, anno$sample_id), ]

cols <- c("bbSpz" = "navy", "EEF"="darkorange", "Merozoite"="lightpink", "oocyst"="steelblue", "ook" = "turquoise4", "ooSpz" = "lightskyblue", "Ring"="hotpink", "sgSpz"= "royalblue", "Schizont" = "violet", "Male"="purple", "Female"="purple4", "ookoo" = "mediumturquoise", "Trophozoite"="violet")

knitr::kable(
  head(molecules[ , 1:3]), booktabs = TRUE,
  caption = 'A table of the first 6 rows and 3 columns of the molecules table.'
)
A table of the first 6 rows and 3 columns of the molecules table.
oocyst_1 oocyst_2 oocyst_3
PBANKA_0000101 0 0 0
PBANKA_0000201 0 0 0
PBANKA_0000301 0 0 0
PBANKA_0000401 0 0 0
PBANKA_0000600 0 0 0
PBANKA_0000701 0 0 0
knitr::kable(
  head(anno[ , 1:3]), booktabs = TRUE,
  caption = 'A table of the first 6 rows and 3 columns of the molecules table.'
)
A table of the first 6 rows and 3 columns of the molecules table.
sample_id sample_name_for_seq npgnum
oocyst_1 9A_oocyst2_SC_Pbe_oocyst 1
oocyst_2 2B_oocyst1_SC_Pbe_oocyst 2
oocyst_3 3B_oocyst1_SC_Pbe_oocyst 3
oocyst_4 4B_oocyst1_SC_Pbe_oocyst 4
oocyst_5 5B_oocyst1_SC_Pbe_oocyst 5
oocyst_6 6B_oocyst1_SC_Pbe_oocyst 6

make MCA object

# Set up objects

mca <- SingleCellExperiment(assays = list(
  counts = as.matrix(molecules),
  logcounts = log2(as.matrix(molecules) + 1)
), colData = anno)

# Set up objects


mca[which(mca$Plate=="core"), ]$Sortdate <- "Fall2017"

mca <- mca[, (colData(mca)$Sortdate != "19.12.2017")] #Removing first rep of bbSPZ because of low genes per cell see Removingbadbbspz_20180424.Rmd
anno2 <- droplevels(subset(anno, Sortdate != "19.12.2017"))

#remove ooSpz
mca <- mca[, (colData(mca)$ShortenedLifeStage != "ooSpz")]
anno2 <- droplevels(subset(anno2, ShortenedLifeStage != "ooSpz"))

#remove ookinetes clusters that are not expressing CTRP
mca <- mca[, (colData(mca)$trueook == "include")]
anno2 <- droplevels(subset(anno2, trueook == "include"))

#Remove asexuals that didn't get classified because they don't pass QC (57 cells)
mca <- mca[, (colData(mca)$ShortenedLifeStage2 != "Shz")]

# define feature names in feature_symbol column
rowData(mca)$feature_symbol <- rownames(mca)
saveRDS(mca, "Pbmca_20180625.rds")
#calculate the quality metrics
mca <- calculateQCMetrics(mca)

#Inspect poor quality cells


tab <- as.data.frame(colData(mca))
ggplot(tab, aes(x=total_features, fill = ShortenedLifeStage2)) + geom_histogram(binwidth = 50) + facet_grid(ShortenedLifeStage2~., scales="free")

ggplot(tab, aes(x=ShortenedLifeStage, total_features)) + geom_violin() +  coord_flip() + stat_summary(fun.y = median, geom = "point", size=2, color="red") + theme_minimal()

tapply(tab$total_counts, tab$ShortenedLifeStage2, median)
##       bbSpz         EEF      Female        Male   Merozoite      oocyst 
##     69433.5    195956.0    249797.0    212727.0    243566.5     63302.0 
##         ook       ookoo        Ring    Schizont       sgSpz Trophozoite 
##    479809.0    289963.5    462611.0    333244.5     60599.5    250863.0
tf <- tapply(tab$total_features, tab$ShortenedLifeStage2, median)
tf
##       bbSpz         EEF      Female        Male   Merozoite      oocyst 
##       859.5      2977.0      2527.0      1576.0       202.0      2995.0 
##         ook       ookoo        Ring    Schizont       sgSpz Trophozoite 
##      1558.0      3088.0       392.5       938.0       419.0      2225.0
#write.csv(tf, file="rawmediantotalfeatures.csv")

QC based on txnal profile

mcasmall <- mca[, (colData(mca)$ShortenedLifeStage2 == "Merozoite") | (colData(mca)$ShortenedLifeStage2 == "ooSpz") | (colData(mca)$ShortenedLifeStage2 == "Ring") | (colData(mca)$ShortenedLifeStage2 == "sgSpz") ]
mcamedium <- mca[, (colData(mca)$ShortenedLifeStage2 == "bbSpz") | (colData(mca)$ShortenedLifeStage2 == "Schizont") | (colData(mca)$ShortenedLifeStage2 == "Trophozoite")]
mcalarge <- mca[, (colData(mca)$ShortenedLifeStage2 == "EEF") | (colData(mca)$ShortenedLifeStage2 == "Female") | (colData(mca)$ShortenedLifeStage2 == "Male") | (colData(mca)$ShortenedLifeStage2 == "ook") | (colData(mca)$ShortenedLifeStage2 == "ookoo") | (colData(mca)$ShortenedLifeStage2 == "oocyst")]



############MCA SMALL
# Filter cells with low counts
filter_by_total_counts <- (mcasmall$total_counts > 400)
table(filter_by_total_counts)
## filter_by_total_counts
## FALSE  TRUE 
##   102   666
# Filter cells with low numbers of features detected
filter_by_expr_features <- (mcasmall$total_features > 40)
table(filter_by_expr_features)
## filter_by_expr_features
## FALSE  TRUE 
##    75   693
# filter out control samples

filter_by_control <- colData(mcasmall)$is_control == FALSE
table(colData(mcasmall)$is_control, colData(mcasmall)$ShortenedLifeStage2)
##        
##         Merozoite Ring sgSpz
##   FALSE       249  282   188
##   TRUE         39    6     4
table(filter_by_control)
## filter_by_control
## FALSE  TRUE 
##    49   719
# Filter data
mcasmall$use <- (
  # sufficient features (genes)
  filter_by_expr_features &
    # sufficient molecules counted
    filter_by_total_counts &
    # controls shouldn't be used in downstream analysis
    filter_by_control
    )
table(mcasmall$use)
## 
## FALSE  TRUE 
##   151   617
############MCA MEDIUM
# Filter cells with low counts
filter_by_total_counts <- (mcamedium$total_counts > 2500)
table(filter_by_total_counts)
## filter_by_total_counts
## FALSE  TRUE 
##     4   421
# Filter cells with low numbers of features detected
filter_by_expr_features <- (mcamedium$total_features > 500)
table(filter_by_expr_features)
## filter_by_expr_features
## FALSE  TRUE 
##     8   417
# filter out control samples
filter_by_control <- colData(mcamedium)$is_control == FALSE
table(colData(mcamedium)$is_control, colData(mcamedium)$ShortenedLifeStage2)
##        
##         bbSpz Schizont Trophozoite
##   FALSE   110      246          67
##   TRUE      2        0           0
table(filter_by_control)
## filter_by_control
## FALSE  TRUE 
##     2   423
# Filter data
mcamedium$use <- (
  # sufficient features (genes)
  filter_by_expr_features &
    # sufficient molecules counted
    filter_by_total_counts &
    # controls shouldn't be used in downstream analysis
    filter_by_control
)
table(mcamedium$use, mcamedium$ShortenedLifeStage2)
##        
##         bbSpz Schizont Trophozoite
##   FALSE     8        0           0
##   TRUE    104      246          67
table(mcasmall$use, mcasmall$ShortenedLifeStage)
##        
##         Merozoite Ring sgSpz
##   FALSE        59   53    39
##   TRUE        229  235   153
############MCA LARGE
# Filter cells with low counts
filter_by_total_counts <- (mcalarge$total_counts > 2500)
table(filter_by_total_counts)
## filter_by_total_counts
## FALSE  TRUE 
##    75   778
# Filter cells with low numbers of features detected
filter_by_expr_features <- (mcalarge$total_features > 1000)
table(filter_by_expr_features)
## filter_by_expr_features
## FALSE  TRUE 
##    93   760
# filter out control samples
filter_by_control <- colData(mcalarge)$is_control == FALSE
table(colData(mcalarge)$is_control, colData(mcalarge)$ShortenedLifeStage2)
##        
##         EEF Female Male oocyst ook ookoo
##   FALSE 186    129   75    133 129   188
##   TRUE    3      0    0      0   6     4
table(filter_by_control)
## filter_by_control
## FALSE  TRUE 
##    13   840
# Filter data
mcalarge$use <- (
  # sufficient features (genes)
  filter_by_expr_features &
    # sufficient molecules counted
    filter_by_total_counts &
    # controls shouldn't be used in downstream analysis
    filter_by_control
)
table(mcamedium$use, mcamedium$ShortenedLifeStage)
##        
##         bbSpz Shz
##   FALSE     8   0
##   TRUE    104 313
table(mcasmall$use, mcasmall$ShortenedLifeStage)
##        
##         Merozoite Ring sgSpz
##   FALSE        59   53    39
##   TRUE        229  235   153
table(mcalarge$use, mcalarge$ShortenedLifeStage)
##        
##         EEF oocyst ook ookoo Shz
##   FALSE  30     27  32     8   3
##   TRUE  159    106 103   184 201
mca <- cbind(mcasmall, mcamedium)
mca <- cbind(mca, mcalarge)
tab2 <- table(mca$use, mca$ShortenedLifeStage2)

#make QCed SingleCellExperiment
mca.qc.cells <- mca[ , colData(mca)$use]
meds <- tapply(colData(mca.qc.cells)$total_features, colData(mca.qc.cells)$ShortenedLifeStage2, median)
meds
##       bbSpz         EEF      Female        Male   Merozoite      oocyst 
##       868.0      3226.0      2528.0      1584.0       200.0      3317.5 
##         ook       ookoo        Ring    Schizont       sgSpz Trophozoite 
##      1638.0      3104.0       460.0       938.0       448.0      2225.0
write.csv(meds, file="QCediantotalfeatures2.csv")
table <- table(mca.qc.cells$use, mca.qc.cells$ShortenedLifeStage2)
table
##       
##        bbSpz EEF Female Male Merozoite oocyst ook ookoo Ring Schizont
##   TRUE   104 159    127   74       229    106 103   184  235      246
##       
##        sgSpz Trophozoite
##   TRUE   153          67
info <- as.data.frame(colData(mca.qc.cells))
ggplot(info, aes(x=ShortenedLifeStage2, total_features)) + geom_violin() +  coord_flip() + stat_summary(fun.y = median, geom = "point", size=2, color="red") + theme_minimal()

write.csv(tab2, file = "cellcounts2.csv", quote = FALSE)
#Identifying outliers by PCA


plotPCA(mca.qc.cells,
        size_by = "total_features",
        colour_by = "seqrunnum", 
        pca_data_input = "colData")

#Looks at the distribution of reads for expression levels of genes
#scater::plotQC(mca, type = "highest-expression")

# Gene filtering (Best to do cells first to get rid of genes only expressed in crappy cells)
# e.g. greater than 1 reads in at least 2 cells
filter_genes <- apply(counts(mca[ , colData(mca)$use]), 1, function(x) length(x[x >= 1]) >= 2)

table(filter_genes)
## filter_genes
## FALSE  TRUE 
##    89  5156
rowData(mca)$use <- filter_genes

dim(mca[rowData(mca)$use, colData(mca)$use])
## [1] 5156 1787
assay(mca, "logcounts_raw") <- log2(counts(mca) + 1)
reducedDim(mca) <- NULL

#make final QCed SingleCellExperiment
mca.qc <- mca[rowData(mca)$use, colData(mca)$use]

saveRDS(mca.qc, file="mca.qc_20180625.rds")

Normalisation using TMM based on IDC, OOKOO, EEF, SPZ, SEX

mca.qc.ookoo <- mca.qc[, (colData(mca.qc)$ShortenedLifeStage == "ook") | (colData(mca.qc)$ShortenedLifeStage == "ookoo") | (colData(mca.qc)$ShortenedLifeStage == "oocyst")]
mca.qc.eef <- mca.qc[, (colData(mca.qc)$ShortenedLifeStage == "EEF")]
mca.qc.spz <- mca.qc[, (colData(mca.qc)$ShortenedLifeStage == "bbSpz") | 
                          (colData(mca.qc)$ShortenedLifeStage == "sgSpz") |
                          (colData(mca.qc)$ShortenedLifeStage == "ooSpz")]
mca.qc.idc <- mca.qc[, (colData(mca.qc)$ShortenedLifeStage == "Merozoite") | 
                       (colData(mca.qc)$ShortenedLifeStage == "Ring") |
                       (colData(mca.qc)$ShortenedLifeStage2 == "Schizont") | 
                       (colData(mca.qc)$ShortenedLifeStage2 == "Trophozoite")]
mca.qc.sex <- mca.qc[, (colData(mca.qc)$ShortenedLifeStage2 == "Male") | (colData(mca.qc)$ShortenedLifeStage2 == "Female")]


mca.qc.ookoo.tmm <- scater::normaliseExprs(mca.qc.ookoo, method = "TMM")
#mca.qc.ooc.tmm <- scater::normaliseExprs(mca.qc.ooc, method = "TMM")
mca.qc.eef.tmm <- scater::normaliseExprs(mca.qc.eef, method = "TMM")
mca.qc.spz.tmm <- scater::normaliseExprs(mca.qc.spz, method = "TMM")
mca.qc.idc.tmm <- scater::normaliseExprs(mca.qc.idc, method = "TMM")
mca.qc.sex.tmm <- scater::normaliseExprs(mca.qc.sex, method = "TMM")

mca.qc.tmm <- cbind(mca.qc.ookoo.tmm, mca.qc.eef.tmm)
#mca.qc.tmm <- cbind(mca.qc.tmm, mca.qc.eef.tmm)
mca.qc.tmm <- cbind(mca.qc.tmm, mca.qc.spz.tmm)
mca.qc.tmm <- cbind(mca.qc.tmm, mca.qc.idc.tmm)
mca.qc.tmm <- cbind(mca.qc.tmm, mca.qc.sex.tmm)

pca <- plotPCA(mca.qc.tmm, colour_by = "ShortenedLifeStage2", exprs_values = "logcounts", ntop = 50)
pcatab <- pca$data
ggplot(pcatab, aes(PC1, PC2)) + geom_point(aes(colour=factor(colour_by))) + scale_color_manual(values = cols) + theme_classic()

saveRDS(mca.qc.tmm, file="MCAqcTMM_20180625.rds")
mca.qc.tmm_counts <- counts(mca.qc.tmm)
mca.qc.tmm_normcounts <- normcounts(mca.qc.tmm)
normlogcounts <- log2(normcounts(mca.qc.tmm) + 1)
qcpheno <- colData(mca.qc.tmm)

write.csv(qcpheno, file = "QC_pheno_20180625.csv", quote = FALSE, row.names = FALSE)
write.csv(mca.qc.tmm_counts, file = "mca.qc.tmm_counts_20180625.csv", quote = FALSE)
write.csv(mca.qc.tmm_normcounts, file = "mca.qc.tmm_norm_counts_20180625.csv", quote = FALSE)
write.csv(normlogcounts, file = "mca.qc.tmm_norm_log_counts_20180625.csv", quote = FALSE)


pca <- plotPCA(mca.qc.tmm, exprs_values = "logcounts", ntop = 500, colour_by = "ShortenedLifeStage2", ncomp=3)
dat <- pca$data
plot_ly(dat, 
        x           = ~PC1, 
        y           = ~PC2, 
        z           = ~PC3,
        type        = 'scatter3d', 
        mode        = 'markers',
         #hoverinfo   = "text",
        #text       = ~paste0(gene_id, " ", gene_name),
        color      = ~colour_by,
        colors     = c("bbSpz" = "navy", "EEF"="darkorange", "Merozoite"="lightpink", "oocyst"="steelblue", "ook" = "turquoise4", "ooSpz" = "lightskyblue", "Ring"="hotpink", "sgSpz"= "royalblue", "Schizont" = "violetred", "Male"="purple", "Female"="purple4", "ookoo" = "mediumturquoise", "Trophozoite"="violet"),
        marker      = list(size = 3)
)

Normalize with TMM on everything together

mca.qc.tmmall <- scater::normaliseExprs(mca.qc, method = "TMM")
pca <- plotPCA(mca.qc.tmmall, colour_by = "ShortenedLifeStage2", exprs_values = "logcounts", ntop = 50)
pcatab <- pca$data
ggplot(pcatab, aes(PC1, PC2)) + geom_point(aes(colour=factor(colour_by))) + scale_color_manual(values = cols) + theme_classic()

saveRDS(mca.qc.tmmall, file="MCAqcTMMall_20180625.rds")

sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2             plotly_4.7.1              
##  [3] scater_1.6.3               SingleCellExperiment_1.0.0
##  [5] SummarizedExperiment_1.8.1 DelayedArray_0.4.1        
##  [7] matrixStats_0.53.1         GenomicRanges_1.30.3      
##  [9] GenomeInfoDb_1.14.0        IRanges_2.12.0            
## [11] S4Vectors_0.16.0           ggplot2_2.2.1             
## [13] Biobase_2.38.0             BiocGenerics_0.24.0       
## [15] Matrix_1.2-14             
## 
## loaded via a namespace (and not attached):
##  [1] viridis_0.5.1          httr_1.3.1             tidyr_0.8.1           
##  [4] edgeR_3.20.9           jsonlite_1.5           bit64_0.9-7           
##  [7] viridisLite_0.3.0      shiny_1.1.0            assertthat_0.2.0      
## [10] highr_0.6              blob_1.1.1             vipor_0.4.5           
## [13] GenomeInfoDbData_1.0.0 yaml_2.1.18            progress_1.1.2        
## [16] pillar_1.2.1           RSQLite_2.1.0          backports_1.1.2       
## [19] lattice_0.20-35        glue_1.2.0             limma_3.34.9          
## [22] digest_0.6.15          promises_1.0.1         XVector_0.18.0        
## [25] colorspace_1.3-2       cowplot_0.9.2          htmltools_0.3.6       
## [28] httpuv_1.4.3           plyr_1.8.4             XML_3.98-1.11         
## [31] pkgconfig_2.0.1        biomaRt_2.34.2         zlibbioc_1.24.0       
## [34] purrr_0.2.5            xtable_1.8-2           scales_0.5.0          
## [37] later_0.7.2            tibble_1.4.2           lazyeval_0.2.1        
## [40] magrittr_1.5           mime_0.5               memoise_1.1.0         
## [43] evaluate_0.10.1        beeswarm_0.2.3         shinydashboard_0.7.0  
## [46] tools_3.4.4            data.table_1.10.4-3    prettyunits_1.0.2     
## [49] stringr_1.3.1          locfit_1.5-9.1         munsell_0.4.3         
## [52] AnnotationDbi_1.40.0   compiler_3.4.4         rlang_0.2.0.9001      
## [55] rhdf5_2.22.0           grid_3.4.4             RCurl_1.95-4.10       
## [58] tximport_1.6.0         htmlwidgets_1.2        rjson_0.2.15          
## [61] crosstalk_1.0.0        labeling_0.3           bitops_1.0-6          
## [64] rmarkdown_1.9          gtable_0.2.0           DBI_1.0.0             
## [67] reshape2_1.4.3         R6_2.2.2               gridExtra_2.3         
## [70] knitr_1.20             dplyr_0.7.5            bit_1.1-12            
## [73] bindr_0.1.1            rprojroot_1.3-2        ggbeeswarm_0.6.0      
## [76] stringi_1.2.2          Rcpp_0.12.17           tidyselect_0.2.4